[Autogluon] Graph_auto시도(안되넹..)

Author

김보람

Published

January 17, 2024

imports

import pandas as pd
import numpy as np
import sklearn.model_selection
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, accuracy_score

import networkx as nx
# autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
# Load the raw transactions; iloc[:, 1:] discards the first CSV column
# (presumably a saved index column -- confirm against the file).
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
# Parse the timestamp column in a single vectorized call rather than invoking
# pd.to_datetime once per row through map/lambda.
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain
trans_date_trans_time cc_num merchant category amt first last gender street city ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 2019-01-01 00:00:00 2.703190e+15 fraud_Rippin, Kub and Mann misc_net 4.97 Jennifer Banks F 561 Perry Cove Moravian Falls ... 36.0788 -81.1781 3495 Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9 1325376018 36.011293 -82.048315 0
1 2019-01-01 00:00:00 6.304230e+11 fraud_Heller, Gutmann and Zieme grocery_pos 107.23 Stephanie Gill F 43039 Riley Greens Suite 393 Orient ... 48.8878 -118.2105 149 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99 1325376044 49.159047 -118.186462 0
2 2019-01-01 00:00:00 3.885950e+13 fraud_Lind-Buckridge entertainment 220.11 Edward Sanchez M 594 White Dale Suite 530 Malad City ... 42.1808 -112.2620 4154 Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95 1325376051 43.150704 -112.154481 0
3 2019-01-01 00:01:00 3.534090e+15 fraud_Kutch, Hermiston and Farrell gas_transport 45.00 Jeremy White M 9443 Cynthia Court Apt. 038 Boulder ... 46.2306 -112.1138 1939 Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81 1325376076 47.034331 -112.561071 0
4 2019-01-01 00:03:00 3.755340e+14 fraud_Keeling-Crist misc_pos 41.96 Tyler Garcia M 408 Bradley Rest Doe Hill ... 38.4207 -79.4629 99 Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46 1325376186 38.674999 -78.632459 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1048570 2020-03-10 16:07:00 6.011980e+15 fraud_Fadel Inc health_fitness 77.00 Haley Wagner F 05561 Farrell Crescent Annapolis ... 39.0305 -76.5515 92106 Accountant, chartered certified 1943-05-28 45ecd198c65e81e597db22e8d2ef7361 1362931649 38.779464 -76.317042 0
1048571 2020-03-10 16:07:00 4.839040e+15 fraud_Cremin, Hamill and Reichel misc_pos 116.94 Meredith Campbell F 043 Hanson Turnpike Hedrick ... 41.1826 -92.3097 1583 Geochemist 1999-06-28 c00ce51c6ebb7657474a77b9e0b51f34 1362931670 41.400318 -92.726724 0
1048572 2020-03-10 16:08:00 5.718440e+11 fraud_O'Connell, Botsford and Hand home 21.27 Susan Mills F 005 Cody Estates Louisville ... 38.2507 -85.7476 736284 Engineering geologist 1952-04-02 17c9dc8b2a6449ca2473726346e58e6c 1362931711 37.293339 -84.798122 0
1048573 2020-03-10 16:08:00 4.646850e+18 fraud_Thompson-Gleason health_fitness 9.52 Julia Bell F 576 House Crossroad West Sayville ... 40.7320 -73.1000 4056 Film/video editor 1990-06-25 5ca650881b48a6a38754f841c23b77ab 1362931718 39.773077 -72.213209 0
1048574 2020-03-10 16:08:00 2.283740e+15 fraud_Buckridge PLC misc_pos 6.81 Shannon Williams F 9345 Spencer Junctions Suite 183 Alpharetta ... 34.0770 -84.3033 165556 Prison officer 1997-12-27 8d0a575fe635bbde12f1a2bffc126731 1362931730 33.601468 -83.891921 0

1048575 rows × 22 columns


def throw(df, percentage):
    """Discard non-fraud rows until fraud makes up ``percentage`` of the result.

    Every row with ``is_fraud == 1`` is kept. Rows with ``is_fraud == 0`` are
    randomly sampled (fixed seed 42) at the fraction needed to reach the
    requested fraud share.

    Args:
        df: DataFrame with an ``is_fraud`` column holding 0/1 values.
        percentage: Target share of fraud rows in the output, e.g. ``0.5``.

    Returns:
        A new DataFrame: all fraud rows followed by the sampled non-fraud rows.
    """
    fraud_rows = df[df['is_fraud'] == 1].copy()
    normal_rows = df[df['is_fraud'] == 0].copy()

    # Fraction of the non-fraud rows to keep so that
    # fraud / (fraud + kept non-fraud) == percentage.
    wanted_normal = len(fraud_rows) * (1 - percentage)
    keep_fraction = wanted_normal / (len(normal_rows) * percentage)

    kept_normal = normal_rows.sample(frac=keep_fraction, random_state=42)
    return pd.concat([fraud_rows, kept_normal])
# Keep every fraud row and downsample the rest so that fraud makes up 50% of df.
df = throw(fraudTrain, 0.5)

autogluon

A. 데이터

def bipartite(df, node_1, node_2, graph_type=None):
    """Build a graph that links the values of two columns, one edge per pair.

    Every distinct value found in ``df[node_1]`` or ``df[node_2]`` becomes an
    integer node id. Rows sharing the same (node_1, node_2) pair are collapsed
    into a single edge carrying two attributes:

    * ``label``  -- 1 if any row of the pair has ``is_fraud`` > 0, else 0
    * ``weight`` -- sum of ``amt`` over the rows of the pair

    Args:
        df: DataFrame with the ``node_1``/``node_2`` columns plus ``amt`` and
            ``is_fraud``. It is copied, so the caller's frame is not modified.
        node_1: Name of the column providing one endpoint of each edge.
        node_2: Name of the column providing the other endpoint.
        graph_type: Passed to networkx as ``create_using`` (a graph class or
            an instance). Defaults to a fresh ``nx.Graph`` per call.

    Returns:
        The networkx graph with ``label`` and ``weight`` set on every edge.
    """
    # A graph *instance* in the signature would be created once at definition
    # time and then cleared and reused by every call, so each new call would
    # wipe the graph returned by the previous one. Use the class instead.
    if graph_type is None:
        graph_type = nx.Graph

    df = df.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df[node_1].values.tolist() +
                                                          df[node_2].values.tolist()))}

    df["from"] = df[node_1].apply(lambda x: mapping[x])  # edge source
    df["to"] = df[node_2].apply(lambda x: mapping[x])  # edge target

    # One row per distinct (from, to) pair.
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from', 'to']).agg({"is_fraud": "sum", "amt": "sum"}).reset_index()
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)

    # Walk the columns in lockstep instead of materialising a Series per row
    # with iterrows().
    edge_keys = [(int(u), int(v)) for u, v in zip(df["from"], df["to"])]
    nx.set_edge_attributes(G, dict(zip(edge_keys, df["is_fraud"])), "label")
    nx.set_edge_attributes(G, dict(zip(edge_keys, df["amt"])), "weight")

    return G




# Graph whose nodes are card numbers (cc_num) and merchants, one edge per card-merchant pair.
G  = bipartite(df, node_1 = 'cc_num', node_2 = 'merchant')


def G_split(G, test_size):
    """Split the edges of ``G`` into a training graph and a test graph.

    Edge positions are split with ``train_test_split`` (seed 42). Each
    returned graph holds only its own share of the edges, but is padded with
    every node of ``G`` so both graphs share the full node set.

    Args:
        G: networkx graph whose edges carry a ``label`` attribute.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(train_graph, test_graph)``.
    """
    all_edges = list(G.edges)
    edge_labels = list(nx.get_edge_attributes(G, "label").values())
    positions = list(range(len(G.edges)))

    train_pos, test_pos, _train_labels, _test_labels = train_test_split(
        positions,
        edge_labels,
        test_size=test_size,
        random_state=42,
    )

    def _graph_from(selected):
        # Copy the edge-induced subgraph, then add back the nodes that lost
        # all of their edges in this split.
        part = G.edge_subgraph([all_edges[i] for i in selected]).copy()
        part.add_nodes_from(list(set(G.nodes) - set(part.nodes)))
        return part

    train_graph = _graph_from(train_pos)
    test_graph = _graph_from(test_pos)
    return train_graph, test_graph
train_graph, test_graph = G_split(G, test_size=0.2)
# Repeat the split at top level with the same arguments and seed that G_split
# uses, so the edge positions and labels behind the two graphs are available
# to the cells below. (These lines were indented under the statement above,
# which is an IndentationError.)
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G.edges))),
    list(nx.get_edge_attributes(G, "label").values()),
    test_size=0.2,
    random_state=42,
)
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
# Edge list of the full graph; train_edges/test_edges hold positions into it.
edgs = list(G.edges)
# Node2Vec is set up on the training graph only, reading edge weights from the "weight" attribute.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
# NOTE(review): each pass overwrites train_embeddings/test_embeddings, so after
# the loop only the vectors of the last class (WeightedL2Embedder) remain.
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 
    # Edge vectors are looked up by the string form of the two node ids; the
    # test edges are embedded with the same train-fitted model.
    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00,  2.59it/s]
# Turn the edge vectors into tables with named feature columns and an
# is_fraud target column. The target has to live in the table as a named
# column, and the accuracy checks further down read tr.is_fraud and tst.
tr = TabularDataset(
    pd.DataFrame(np.asarray(train_embeddings)).add_prefix("emb_").assign(is_fraud=train_labels)
)
tst = TabularDataset(
    pd.DataFrame(np.asarray(test_embeddings)).add_prefix("emb_").assign(is_fraud=test_labels)
)

B. predictor 생성

# `label` is the NAME of the target column, not the label values themselves;
# passing the list of labels is what leads fit() to fail with the
# AttributeError recorded below. The training table must carry an is_fraud
# column, as the accuracy checks in section D already assume.
predictr = TabularPredictor(label="is_fraud")
No path specified. Models will be saved in: "AutogluonModels/ag-20240117_122745/"

C. 적합(fit)

# Train on tr using the 'best_quality' preset.
predictr.fit(tr, presets='best_quality')
Presets specified: ['best_quality']
AttributeError: 'TabularDataset' object has no attribute 'unique'
# Tabulate the trained models with their validation scores and timings.
predictr.leaderboard()
                      model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2   0.894772       0.020896   4.365760                0.009703           2.119462            2       True         14
1           CatBoost_BAG_L1   0.894661       0.004997   1.386780                0.004997           1.386780            1       True          7
2            XGBoost_BAG_L1   0.894439       0.025061   0.600274                0.025061           0.600274            1       True         11
3      LightGBMLarge_BAG_L1   0.894106       0.006197   0.859518                0.006197           0.859518            1       True         13
4           LightGBM_BAG_L1   0.893995       0.015738   0.650386                0.015738           0.650386            1       True          4
5     NeuralNetTorch_BAG_L1   0.888778       0.050014  14.929281                0.050014          14.929281            1       True         12
6         LightGBMXT_BAG_L1   0.885004       0.030494   0.456313                0.030494           0.456313            1       True          3
7     KNeighborsUnif_BAG_L1   0.878233       0.011929   0.005328                0.011929           0.005328            1       True          1
8    NeuralNetFastAI_BAG_L1   0.867022       0.089430   7.351443                0.089430           7.351443            1       True         10
9     KNeighborsDist_BAG_L1   0.864136       0.009754   0.004292                0.009754           0.004292            1       True          2
10    ExtraTreesEntr_BAG_L1   0.862582       0.211025   0.299140                0.211025           0.299140            1       True          9
11    ExtraTreesGini_BAG_L1   0.862249       0.203468   0.341149                0.203468           0.341149            1       True          8
12  RandomForestEntr_BAG_L1   0.856033       0.185369   0.526263                0.185369           0.526263            1       True          6
13  RandomForestGini_BAG_L1   0.856033       0.190420   0.333284                0.190420           0.333284            1       True          5
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 WeightedEnsemble_L2 0.894772 0.020896 4.365760 0.009703 2.119462 2 True 14
1 CatBoost_BAG_L1 0.894661 0.004997 1.386780 0.004997 1.386780 1 True 7
2 XGBoost_BAG_L1 0.894439 0.025061 0.600274 0.025061 0.600274 1 True 11
3 LightGBMLarge_BAG_L1 0.894106 0.006197 0.859518 0.006197 0.859518 1 True 13
4 LightGBM_BAG_L1 0.893995 0.015738 0.650386 0.015738 0.650386 1 True 4
5 NeuralNetTorch_BAG_L1 0.888778 0.050014 14.929281 0.050014 14.929281 1 True 12
6 LightGBMXT_BAG_L1 0.885004 0.030494 0.456313 0.030494 0.456313 1 True 3
7 KNeighborsUnif_BAG_L1 0.878233 0.011929 0.005328 0.011929 0.005328 1 True 1
8 NeuralNetFastAI_BAG_L1 0.867022 0.089430 7.351443 0.089430 7.351443 1 True 10
9 KNeighborsDist_BAG_L1 0.864136 0.009754 0.004292 0.009754 0.004292 1 True 2
10 ExtraTreesEntr_BAG_L1 0.862582 0.211025 0.299140 0.211025 0.299140 1 True 9
11 ExtraTreesGini_BAG_L1 0.862249 0.203468 0.341149 0.203468 0.341149 1 True 8
12 RandomForestEntr_BAG_L1 0.856033 0.185369 0.526263 0.185369 0.526263 1 True 6
13 RandomForestGini_BAG_L1 0.856033 0.190420 0.333284 0.190420 0.333284 1 True 5

D. 예측(predict)

# Share of rows in tr (the table passed to fit) whose predicted label equals is_fraud.
(tr.is_fraud == predictr.predict(tr)).mean()
0.8967698967698968
# Share of rows in tst whose predicted label equals is_fraud.
(tst.is_fraud == predictr.predict(tst)).mean()
0.9021908791725435